{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Semi-supervised Self-learning Classification with Iris Dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1.12.0\n"
     ]
    }
   ],
   "source": [
    "import os\n",
    "import shutil\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import tensorflow as tf\n",
    "print(tf.__version__)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Get data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Fetch the Iris CSV files over HTTPS so the downloads cannot be altered in transit;\n",
    "# get_file returns the local path of each downloaded copy.\n",
    "TRAIN_URL = \"https://storage.googleapis.com/download.tensorflow.org/data/iris_training.csv\"\n",
    "TEST_URL = \"https://storage.googleapis.com/download.tensorflow.org/data/iris_test.csv\"\n",
    "train_path = tf.keras.utils.get_file(fname = TRAIN_URL.split('/')[-1], origin = TRAIN_URL)\n",
    "test_path = tf.keras.utils.get_file(fname = TEST_URL.split('/')[-1], origin = TEST_URL)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Publish both local file paths as environment variables for the shell cells that follow\n",
    "os.environ.update({\n",
    "  'TRAIN_PATH': train_path,\n",
    "  'TEST_PATH': test_path})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "120,4,setosa,versicolor,virginica\n",
      "6.4,2.8,5.6,2.2,2\n",
      "5.0,2.3,3.3,1.0,1\n",
      "4.9,2.5,4.5,1.7,2\n",
      "4.9,3.1,1.5,0.1,0\n",
      "5.7,3.8,1.7,0.3,0\n",
      "4.4,3.2,1.3,0.2,0\n",
      "5.4,3.4,1.5,0.4,0\n",
      "6.9,3.1,5.1,2.3,2\n",
      "6.7,3.1,4.4,1.4,1\n"
     ]
    }
   ],
   "source": [
    "# Preview the first lines of the training CSV; TRAIN_PATH is the environment variable set in the previous cell\n",
    "!head ${TRAIN_PATH}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "30,4,setosa,versicolor,virginica\n",
      "5.9,3.0,4.2,1.5,1\n",
      "6.9,3.1,5.4,2.1,2\n",
      "5.1,3.3,1.7,0.5,0\n",
      "6.0,3.4,4.5,1.6,1\n",
      "5.5,2.5,4.0,1.3,1\n",
      "6.2,2.9,4.3,1.3,1\n",
      "5.5,4.2,1.4,0.2,0\n",
      "6.3,2.8,5.1,1.5,2\n",
      "5.6,3.0,4.1,1.3,1\n"
     ]
    }
   ],
   "source": [
    "# Preview the first lines of the test CSV; TEST_PATH is an environment variable set earlier in the notebook\n",
    "!head ${TEST_PATH}"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Create fully supervised model for comparison"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Layout of the Iris CSV rows: four float measurements followed by the integer class id\n",
    "LABEL_COLUMN_NAME = \"Species\"\n",
    "CSV_COLUMN_NAMES = [\"SepalLength\", \"SepalWidth\", \"PetalLength\", \"PetalWidth\", LABEL_COLUMN_NAME]\n",
    "SPECIES_NAMES = [\"Setosa\", \"Versicolor\", \"Virginica\"]\n",
    "# One default per CSV column, in column order: float for every feature, int for the label\n",
    "CSV_COLUMN_DEFAULTS = [[0.0] for _ in CSV_COLUMN_NAMES[:-1]] + [[0]]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Create an input function reading a file using the Dataset API\n",
    "# Then provide the results to the Estimator API\n",
    "def read_dataset(filename, mode, batch_size = 512):\n",
    "  \"\"\"Build an Estimator input function that reads CSV rows as batched (features, label) pairs.\n",
    "\n",
    "  Args:\n",
    "    filename: Path or glob pattern of the CSV file(s) to read.\n",
    "    mode: A tf.estimator.ModeKeys value; TRAIN shuffles and repeats indefinitely,\n",
    "      every other mode makes a single pass.\n",
    "    batch_size: Number of rows per batch.\n",
    "\n",
    "  Returns:\n",
    "    A zero-argument function returning a tf.data.Dataset of (feature dict, label) batches.\n",
    "\n",
    "  Raises:\n",
    "    FileNotFoundError: Raised by the returned function when no file matches filename.\n",
    "  \"\"\"\n",
    "  def _input_fn():\n",
    "    def decode_csv(value_column):\n",
    "      \"\"\"Parse one CSV line into a dict of feature tensors plus the label tensor.\"\"\"\n",
    "      columns = tf.decode_csv(records = value_column, record_defaults = CSV_COLUMN_DEFAULTS)\n",
    "      features = dict(zip(CSV_COLUMN_NAMES, columns))\n",
    "      label = features.pop(LABEL_COLUMN_NAME)\n",
    "      return features, label\n",
    "\n",
    "    # Create list of files that match pattern\n",
    "    file_list = tf.gfile.Glob(filename = filename)\n",
    "    if not file_list:\n",
    "      # Fail loudly rather than handing the Estimator a dataset with no rows\n",
    "      raise FileNotFoundError(\"No files match pattern: {}\".format(filename))\n",
    "\n",
    "    # Read every file line by line and drop its first (header) line. Skipping per file keeps\n",
    "    # header rows out of the data when the pattern matches more than one file.\n",
    "    dataset = tf.data.Dataset.from_tensor_slices(tensors = file_list).flat_map(\n",
    "      map_func = lambda file_path: tf.data.TextLineDataset(filenames = file_path).skip(count = 1))\n",
    "\n",
    "    dataset = dataset.map(map_func = decode_csv)  # Transform each elem by applying decode_csv fn\n",
    "\n",
    "    if mode == tf.estimator.ModeKeys.TRAIN:\n",
    "      num_epochs = None # indefinitely\n",
    "      dataset = dataset.shuffle(buffer_size = 10 * batch_size)\n",
    "    else:\n",
    "      num_epochs = 1 # end-of-input after this\n",
    "\n",
    "    dataset = dataset.repeat(count = num_epochs)\n",
    "    dataset = dataset.batch(batch_size = batch_size)\n",
    "    return dataset\n",
    "  return _input_fn"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Define feature columns\n",
    "def create_feature_columns():\n",
    "  \"\"\"Return one numeric feature column per entry of CSV_COLUMN_NAMES except the last.\"\"\"\n",
    "  feature_names = CSV_COLUMN_NAMES[:-1]  # every column but the final one is a numeric feature\n",
    "  numeric_columns = []\n",
    "  for feature_name in feature_names:\n",
    "    numeric_columns.append(tf.feature_column.numeric_column(key = feature_name))\n",
    "  return numeric_columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Model directory (checkpoints) for the fully supervised baseline estimator\n",
    "SUPERVISED_MODEL_DIR = \"supervised_trained\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:Using default config.\n",
      "INFO:tensorflow:Using config: {'_model_dir': 'supervised_trained', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true\n",
      "graph_options {\n",
      "  rewrite_options {\n",
      "    meta_optimizer_iterations: ONE\n",
      "  }\n",
      "}\n",
      ", '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x105c11438>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}\n"
     ]
    }
   ],
   "source": [
    "# Fully supervised baseline: a canned DNN classifier with two hidden layers\n",
    "# of 10 nodes each that chooses between the 3 Iris classes.\n",
    "supervised_estimator = tf.estimator.DNNClassifier(\n",
    "  hidden_units = [10, 10],\n",
    "  n_classes = 3,\n",
    "  feature_columns = create_feature_columns(),\n",
    "  model_dir = SUPERVISED_MODEL_DIR)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:Calling model_fn.\n",
      "INFO:tensorflow:Done calling model_fn.\n",
      "INFO:tensorflow:Create CheckpointSaverHook.\n",
      "INFO:tensorflow:Graph was finalized.\n",
      "INFO:tensorflow:Running local_init_op.\n",
      "INFO:tensorflow:Done running local_init_op.\n",
      "INFO:tensorflow:Saving checkpoints for 0 into supervised_trained/model.ckpt.\n",
      "INFO:tensorflow:loss = 50.350456, step = 1\n",
      "INFO:tensorflow:global_step/sec: 202.61\n",
      "INFO:tensorflow:loss = 4.913475, step = 101 (0.494 sec)\n",
      "INFO:tensorflow:global_step/sec: 297\n",
      "INFO:tensorflow:loss = 4.3145337, step = 201 (0.337 sec)\n",
      "INFO:tensorflow:global_step/sec: 273.068\n",
      "INFO:tensorflow:loss = 4.5109253, step = 301 (0.366 sec)\n",
      "INFO:tensorflow:global_step/sec: 303.565\n",
      "INFO:tensorflow:loss = 1.286738, step = 401 (0.329 sec)\n",
      "INFO:tensorflow:global_step/sec: 298.148\n",
      "INFO:tensorflow:loss = 1.6059611, step = 501 (0.335 sec)\n",
      "INFO:tensorflow:global_step/sec: 300.551\n",
      "INFO:tensorflow:loss = 1.0319146, step = 601 (0.333 sec)\n",
      "INFO:tensorflow:global_step/sec: 305.393\n",
      "INFO:tensorflow:loss = 4.351371, step = 701 (0.327 sec)\n",
      "INFO:tensorflow:global_step/sec: 298.63\n",
      "INFO:tensorflow:loss = 1.7432094, step = 801 (0.335 sec)\n",
      "INFO:tensorflow:global_step/sec: 286.438\n",
      "INFO:tensorflow:loss = 4.393015, step = 901 (0.349 sec)\n",
      "INFO:tensorflow:global_step/sec: 297.222\n",
      "INFO:tensorflow:loss = 1.6945345, step = 1001 (0.337 sec)\n",
      "INFO:tensorflow:global_step/sec: 296.086\n",
      "INFO:tensorflow:loss = 0.9381083, step = 1101 (0.337 sec)\n",
      "INFO:tensorflow:global_step/sec: 300.692\n",
      "INFO:tensorflow:loss = 1.8993287, step = 1201 (0.333 sec)\n",
      "INFO:tensorflow:global_step/sec: 306.668\n",
      "INFO:tensorflow:loss = 4.713934, step = 1301 (0.326 sec)\n",
      "INFO:tensorflow:global_step/sec: 283.518\n",
      "INFO:tensorflow:loss = 0.7597991, step = 1401 (0.353 sec)\n",
      "INFO:tensorflow:global_step/sec: 285.821\n",
      "INFO:tensorflow:loss = 0.9438435, step = 1501 (0.350 sec)\n",
      "INFO:tensorflow:global_step/sec: 298.196\n",
      "INFO:tensorflow:loss = 1.4582022, step = 1601 (0.335 sec)\n",
      "INFO:tensorflow:global_step/sec: 294.754\n",
      "INFO:tensorflow:loss = 0.91525185, step = 1701 (0.339 sec)\n",
      "INFO:tensorflow:global_step/sec: 299.15\n",
      "INFO:tensorflow:loss = 1.538285, step = 1801 (0.334 sec)\n",
      "INFO:tensorflow:global_step/sec: 300.322\n",
      "INFO:tensorflow:loss = 3.8142955, step = 1901 (0.333 sec)\n",
      "INFO:tensorflow:Saving checkpoints for 2000 into supervised_trained/model.ckpt.\n",
      "INFO:tensorflow:Loss for final step: 4.067023.\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "<tensorflow.python.estimator.canned.dnn.DNNClassifier at 0x1048de358>"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Remove checkpoints left by earlier runs so every execution trains from scratch\n",
    "shutil.rmtree(path = SUPERVISED_MODEL_DIR, ignore_errors = True)\n",
    "\n",
    "# Input pipeline over the fully labeled training file, in batches of 32\n",
    "supervised_train_input_fn = read_dataset(\n",
    "  filename = train_path, \n",
    "  mode = tf.estimator.ModeKeys.TRAIN, \n",
    "  batch_size = 32)\n",
    "\n",
    "supervised_estimator.train(input_fn = supervised_train_input_fn, steps = 2000)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:Calling model_fn.\n",
      "INFO:tensorflow:Done calling model_fn.\n",
      "INFO:tensorflow:Starting evaluation at 2019-03-15-09:17:42\n",
      "INFO:tensorflow:Graph was finalized.\n",
      "INFO:tensorflow:Restoring parameters from supervised_trained/model.ckpt-2000\n",
      "INFO:tensorflow:Running local_init_op.\n",
      "INFO:tensorflow:Done running local_init_op.\n",
      "INFO:tensorflow:Finished evaluation at 2019-03-15-09:17:43\n",
      "INFO:tensorflow:Saving dict for global step 2000: accuracy = 0.96666664, average_loss = 0.05467449, global_step = 2000, loss = 1.6402347\n",
      "INFO:tensorflow:Saving 'checkpoint_path' summary for global step 2000: supervised_trained/model.ckpt-2000\n"
     ]
    }
   ],
   "source": [
    "# Score the supervised baseline on the test file; steps = None evaluates until the input is exhausted\n",
    "supervised_eval_input_fn = read_dataset(\n",
    "  filename = test_path, \n",
    "  mode = tf.estimator.ModeKeys.EVAL, \n",
    "  batch_size = 512)\n",
    "\n",
    "eval_metrics = supervised_estimator.evaluate(input_fn = supervised_eval_input_fn, steps = None)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Prepare labeled and unlabeled training sets for the semi-supervised model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>SepalLength</th>\n",
       "      <th>SepalWidth</th>\n",
       "      <th>PetalLength</th>\n",
       "      <th>PetalWidth</th>\n",
       "      <th>Species</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>6.4</td>\n",
       "      <td>2.8</td>\n",
       "      <td>5.6</td>\n",
       "      <td>2.2</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>5.0</td>\n",
       "      <td>2.3</td>\n",
       "      <td>3.3</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>4.9</td>\n",
       "      <td>2.5</td>\n",
       "      <td>4.5</td>\n",
       "      <td>1.7</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4.9</td>\n",
       "      <td>3.1</td>\n",
       "      <td>1.5</td>\n",
       "      <td>0.1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5.7</td>\n",
       "      <td>3.8</td>\n",
       "      <td>1.7</td>\n",
       "      <td>0.3</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   SepalLength  SepalWidth  PetalLength  PetalWidth  Species\n",
       "0          6.4         2.8          5.6         2.2        2\n",
       "1          5.0         2.3          3.3         1.0        1\n",
       "2          4.9         2.5          4.5         1.7        2\n",
       "3          4.9         3.1          1.5         0.1        0\n",
       "4          5.7         3.8          1.7         0.3        0"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Load the training CSV into pandas; header = 0 combined with names replaces the file's first row with CSV_COLUMN_NAMES\n",
    "supervised_train_df = pd.read_csv(train_path, sep = ',', header = 0, names = CSV_COLUMN_NAMES)\n",
    "supervised_train_df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "number_of_train_examples = 120\n"
     ]
    }
   ],
   "source": [
    "# Row count of the training frame\n",
    "number_of_train_examples = supervised_train_df.shape[0]\n",
    "count_message = \"number_of_train_examples = {}\"\n",
    "print(count_message.format(number_of_train_examples))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "number_of_labeled_train_examples = 4 & number_of_unlabeled_train_examples = 116\n"
     ]
    }
   ],
   "source": [
    "# Keep labels for 4% of the training rows (truncated to a whole number); the remaining rows are treated as unlabeled\n",
    "labeled_fraction = 0.04\n",
    "number_of_labeled_train_examples = int(number_of_train_examples * labeled_fraction)\n",
    "number_of_unlabeled_train_examples = number_of_train_examples - number_of_labeled_train_examples\n",
    "split_message = \"number_of_labeled_train_examples = {} & number_of_unlabeled_train_examples = {}\"\n",
    "print(split_message.format(number_of_labeled_train_examples, number_of_unlabeled_train_examples))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "# The first rows keep their labels; every later row keeps only its feature columns\n",
    "feature_only_columns = CSV_COLUMN_NAMES[:-1]\n",
    "semi_supervised_labeled_train_original_df = (\n",
    "  supervised_train_df.iloc[:number_of_labeled_train_examples].reset_index(drop = True))\n",
    "semi_supervised_unlabeled_train_original_df = (\n",
    "  supervised_train_df.iloc[number_of_labeled_train_examples:][feature_only_columns].reset_index(drop = True))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Create semi-supervised model using sparse labels"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Model directory (checkpoints) for the semi-supervised estimator\n",
    "SEMI_SUPERVISED_MODEL_DIR = \"semi_supervised_trained\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:Using default config.\n",
      "INFO:tensorflow:Using config: {'_model_dir': 'semi_supervised_trained', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true\n",
      "graph_options {\n",
      "  rewrite_options {\n",
      "    meta_optimizer_iterations: ONE\n",
      "  }\n",
      "}\n",
      ", '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0xb3438a828>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}\n"
     ]
    }
   ],
   "source": [
    "# Semi-supervised classifier: a canned DNN with two hidden layers of 10 nodes and 3 output classes\n",
    "semi_supervised_estimator = tf.estimator.DNNClassifier(\n",
    "  hidden_units = [10, 10],\n",
    "  n_classes = 3,\n",
    "  feature_columns = create_feature_columns(),\n",
    "  model_dir = SEMI_SUPERVISED_MODEL_DIR)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>SepalLength</th>\n",
       "      <th>SepalWidth</th>\n",
       "      <th>PetalLength</th>\n",
       "      <th>PetalWidth</th>\n",
       "      <th>Species</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>6.4</td>\n",
       "      <td>2.8</td>\n",
       "      <td>5.6</td>\n",
       "      <td>2.2</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>5.0</td>\n",
       "      <td>2.3</td>\n",
       "      <td>3.3</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>4.9</td>\n",
       "      <td>2.5</td>\n",
       "      <td>4.5</td>\n",
       "      <td>1.7</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4.9</td>\n",
       "      <td>3.1</td>\n",
       "      <td>1.5</td>\n",
       "      <td>0.1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   SepalLength  SepalWidth  PetalLength  PetalWidth  Species\n",
       "0          6.4         2.8          5.6         2.2        2\n",
       "1          5.0         2.3          3.3         1.0        1\n",
       "2          4.9         2.5          4.5         1.7        2\n",
       "3          4.9         3.1          1.5         0.1        0"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Display the labeled examples that seed the semi-supervised training\n",
    "semi_supervised_labeled_train_original_df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [],
   "source": [
    "# A predicted label is adopted only when the highest class probability is strictly greater than this value\n",
    "confidence_threshold = 0.95"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "loop_counter = 0, number_of_labeled_examples = 4, number_of_unlabeled_examples = 116\n",
      "\n",
      "WARNING:tensorflow:From /Users/ryangillard/anaconda3/lib/python3.6/site-packages/tensorflow/python/estimator/inputs/queues/feeding_queue_runner.py:62: QueueRunner.__init__ (from tensorflow.python.training.queue_runner_impl) is deprecated and will be removed in a future version.\n",
      "Instructions for updating:\n",
      "To construct input pipelines, use the `tf.data` module.\n",
      "WARNING:tensorflow:From /Users/ryangillard/anaconda3/lib/python3.6/site-packages/tensorflow/python/estimator/inputs/queues/feeding_functions.py:500: add_queue_runner (from tensorflow.python.training.queue_runner_impl) is deprecated and will be removed in a future version.\n",
      "Instructions for updating:\n",
      "To construct input pipelines, use the `tf.data` module.\n",
      "INFO:tensorflow:Calling model_fn.\n",
      "INFO:tensorflow:Done calling model_fn.\n",
      "INFO:tensorflow:Create CheckpointSaverHook.\n",
      "INFO:tensorflow:Graph was finalized.\n",
      "INFO:tensorflow:Running local_init_op.\n",
      "INFO:tensorflow:Done running local_init_op.\n",
      "WARNING:tensorflow:From /Users/ryangillard/anaconda3/lib/python3.6/site-packages/tensorflow/python/training/monitored_session.py:804: start_queue_runners (from tensorflow.python.training.queue_runner_impl) is deprecated and will be removed in a future version.\n",
      "Instructions for updating:\n",
      "To construct input pipelines, use the `tf.data` module.\n",
      "INFO:tensorflow:Saving checkpoints for 0 into semi_supervised_trained/model.ckpt.\n",
      "INFO:tensorflow:loss = 34.139137, step = 1\n",
      "INFO:tensorflow:global_step/sec: 268.311\n",
      "INFO:tensorflow:loss = 3.249683, step = 101 (0.376 sec)\n",
      "INFO:tensorflow:global_step/sec: 491.652\n",
      "INFO:tensorflow:loss = 0.5667273, step = 201 (0.202 sec)\n",
      "INFO:tensorflow:global_step/sec: 487.047\n",
      "INFO:tensorflow:loss = 0.32575104, step = 301 (0.207 sec)\n",
      "INFO:tensorflow:global_step/sec: 528.913\n",
      "INFO:tensorflow:loss = 0.19807765, step = 401 (0.189 sec)\n",
      "INFO:tensorflow:global_step/sec: 502.451\n",
      "INFO:tensorflow:loss = 0.083514765, step = 501 (0.198 sec)\n",
      "INFO:tensorflow:global_step/sec: 484.882\n",
      "INFO:tensorflow:loss = 0.07296247, step = 601 (0.205 sec)\n",
      "INFO:tensorflow:global_step/sec: 514.38\n",
      "INFO:tensorflow:loss = 0.037595317, step = 701 (0.195 sec)\n",
      "INFO:tensorflow:global_step/sec: 526.565\n",
      "INFO:tensorflow:loss = 0.050383218, step = 801 (0.188 sec)\n",
      "INFO:tensorflow:global_step/sec: 518.997\n",
      "INFO:tensorflow:loss = 0.0522907, step = 901 (0.195 sec)\n",
      "INFO:tensorflow:global_step/sec: 547.136\n",
      "INFO:tensorflow:loss = 0.044426173, step = 1001 (0.180 sec)\n",
      "INFO:tensorflow:global_step/sec: 514.928\n",
      "INFO:tensorflow:loss = 0.031139556, step = 1101 (0.196 sec)\n",
      "INFO:tensorflow:global_step/sec: 503.641\n",
      "INFO:tensorflow:loss = 0.026995052, step = 1201 (0.198 sec)\n",
      "INFO:tensorflow:global_step/sec: 493.474\n",
      "INFO:tensorflow:loss = 0.027659303, step = 1301 (0.203 sec)\n",
      "INFO:tensorflow:global_step/sec: 513.6\n",
      "INFO:tensorflow:loss = 0.033336066, step = 1401 (0.193 sec)\n",
      "INFO:tensorflow:global_step/sec: 501.918\n",
      "INFO:tensorflow:loss = 0.024612999, step = 1501 (0.200 sec)\n",
      "INFO:tensorflow:global_step/sec: 516.02\n",
      "INFO:tensorflow:loss = 0.027367074, step = 1601 (0.194 sec)\n",
      "INFO:tensorflow:global_step/sec: 502.459\n",
      "INFO:tensorflow:loss = 0.01730816, step = 1701 (0.198 sec)\n",
      "INFO:tensorflow:global_step/sec: 524.117\n",
      "INFO:tensorflow:loss = 0.013451718, step = 1801 (0.192 sec)\n",
      "INFO:tensorflow:global_step/sec: 513.774\n",
      "INFO:tensorflow:loss = 0.017365355, step = 1901 (0.196 sec)\n",
      "INFO:tensorflow:Saving checkpoints for 2000 into semi_supervised_trained/model.ckpt.\n",
      "INFO:tensorflow:Loss for final step: 0.01110098.\n",
      "INFO:tensorflow:Calling model_fn.\n",
      "INFO:tensorflow:Done calling model_fn.\n",
      "INFO:tensorflow:Starting evaluation at 2019-03-15-09:17:49\n",
      "INFO:tensorflow:Graph was finalized.\n",
      "INFO:tensorflow:Restoring parameters from semi_supervised_trained/model.ckpt-2000\n",
      "INFO:tensorflow:Running local_init_op.\n",
      "INFO:tensorflow:Done running local_init_op.\n",
      "INFO:tensorflow:Finished evaluation at 2019-03-15-09:17:50\n",
      "INFO:tensorflow:Saving dict for global step 2000: accuracy = 0.93333334, average_loss = 0.16812083, global_step = 2000, loss = 5.043625\n",
      "INFO:tensorflow:Saving 'checkpoint_path' summary for global step 2000: semi_supervised_trained/model.ckpt-2000\n",
      "INFO:tensorflow:Calling model_fn.\n",
      "INFO:tensorflow:Done calling model_fn.\n",
      "INFO:tensorflow:Graph was finalized.\n",
      "INFO:tensorflow:Restoring parameters from semi_supervised_trained/model.ckpt-2000\n",
      "INFO:tensorflow:Running local_init_op.\n",
      "INFO:tensorflow:Done running local_init_op.\n",
      "\n",
      "loop_counter = 1, number_of_labeled_examples = 104, number_of_unlabeled_examples = 16\n",
      "\n",
      "INFO:tensorflow:Calling model_fn.\n",
      "INFO:tensorflow:Done calling model_fn.\n",
      "INFO:tensorflow:Create CheckpointSaverHook.\n",
      "INFO:tensorflow:Graph was finalized.\n",
      "INFO:tensorflow:Restoring parameters from semi_supervised_trained/model.ckpt-2000\n",
      "INFO:tensorflow:Running local_init_op.\n",
      "INFO:tensorflow:Done running local_init_op.\n",
      "INFO:tensorflow:Saving checkpoints for 2000 into semi_supervised_trained/model.ckpt.\n",
      "INFO:tensorflow:loss = 0.07429041, step = 2001\n",
      "INFO:tensorflow:global_step/sec: 266.256\n",
      "INFO:tensorflow:loss = 0.19109085, step = 2101 (0.380 sec)\n",
      "INFO:tensorflow:global_step/sec: 513.315\n",
      "INFO:tensorflow:loss = 0.03046345, step = 2201 (0.192 sec)\n",
      "INFO:tensorflow:global_step/sec: 528.958\n",
      "INFO:tensorflow:loss = 0.0803492, step = 2301 (0.190 sec)\n",
      "INFO:tensorflow:global_step/sec: 528.815\n",
      "INFO:tensorflow:loss = 0.02711267, step = 2401 (0.188 sec)\n",
      "INFO:tensorflow:global_step/sec: 525.912\n",
      "INFO:tensorflow:loss = 0.0621346, step = 2501 (0.191 sec)\n",
      "INFO:tensorflow:global_step/sec: 494.149\n",
      "INFO:tensorflow:loss = 0.03942996, step = 2601 (0.201 sec)\n",
      "INFO:tensorflow:global_step/sec: 517.902\n",
      "INFO:tensorflow:loss = 0.055636443, step = 2701 (0.195 sec)\n",
      "INFO:tensorflow:global_step/sec: 506.771\n",
      "INFO:tensorflow:loss = 0.012852859, step = 2801 (0.197 sec)\n",
      "INFO:tensorflow:global_step/sec: 497.057\n",
      "INFO:tensorflow:loss = 0.043630213, step = 2901 (0.201 sec)\n",
      "INFO:tensorflow:global_step/sec: 531.222\n",
      "INFO:tensorflow:loss = 0.048151374, step = 3001 (0.187 sec)\n",
      "INFO:tensorflow:global_step/sec: 513.418\n",
      "INFO:tensorflow:loss = 0.024122152, step = 3101 (0.196 sec)\n",
      "INFO:tensorflow:global_step/sec: 462.776\n",
      "INFO:tensorflow:loss = 0.012519169, step = 3201 (0.216 sec)\n",
      "INFO:tensorflow:global_step/sec: 492.324\n",
      "INFO:tensorflow:loss = 0.011430501, step = 3301 (0.203 sec)\n",
      "INFO:tensorflow:global_step/sec: 525.713\n",
      "INFO:tensorflow:loss = 0.029876059, step = 3401 (0.189 sec)\n",
      "INFO:tensorflow:global_step/sec: 517.499\n",
      "INFO:tensorflow:loss = 0.026909253, step = 3501 (0.195 sec)\n",
      "INFO:tensorflow:global_step/sec: 518.834\n",
      "INFO:tensorflow:loss = 0.013660897, step = 3601 (0.193 sec)\n",
      "INFO:tensorflow:global_step/sec: 465.732\n",
      "INFO:tensorflow:loss = 0.029427981, step = 3701 (0.213 sec)\n",
      "INFO:tensorflow:global_step/sec: 465.864\n",
      "INFO:tensorflow:loss = 0.0075616194, step = 3801 (0.216 sec)\n",
      "INFO:tensorflow:global_step/sec: 514.401\n",
      "INFO:tensorflow:loss = 0.028024865, step = 3901 (0.193 sec)\n",
      "INFO:tensorflow:Saving checkpoints for 4000 into semi_supervised_trained/model.ckpt.\n",
      "INFO:tensorflow:Loss for final step: 0.003191098.\n",
      "INFO:tensorflow:Calling model_fn.\n",
      "INFO:tensorflow:Done calling model_fn.\n",
      "INFO:tensorflow:Starting evaluation at 2019-03-15-09:17:56\n",
      "INFO:tensorflow:Graph was finalized.\n",
      "INFO:tensorflow:Restoring parameters from semi_supervised_trained/model.ckpt-4000\n",
      "INFO:tensorflow:Running local_init_op.\n",
      "INFO:tensorflow:Done running local_init_op.\n",
      "INFO:tensorflow:Finished evaluation at 2019-03-15-09:17:56\n",
      "INFO:tensorflow:Saving dict for global step 4000: accuracy = 0.93333334, average_loss = 0.2034376, global_step = 4000, loss = 6.103128\n",
      "INFO:tensorflow:Saving 'checkpoint_path' summary for global step 4000: semi_supervised_trained/model.ckpt-4000\n",
      "INFO:tensorflow:Calling model_fn.\n",
      "INFO:tensorflow:Done calling model_fn.\n",
      "INFO:tensorflow:Graph was finalized.\n",
      "INFO:tensorflow:Restoring parameters from semi_supervised_trained/model.ckpt-4000\n",
      "INFO:tensorflow:Running local_init_op.\n",
      "INFO:tensorflow:Done running local_init_op.\n"
     ]
    }
   ],
   "source": [
    "shutil.rmtree(path = SEMI_SUPERVISED_MODEL_DIR, ignore_errors = True) # start fresh each time\n",
    "\n",
    "# Self-training: fit on the labeled pool, pseudo-label the unlabeled rows the model is confident about,\n",
    "# move those rows into the labeled pool, and repeat.\n",
    "feature_column_names = CSV_COLUMN_NAMES[0:-1]\n",
    "\n",
    "# Work on copies so re-running this cell starts again from the original split\n",
    "semi_supervised_labeled_train_df = semi_supervised_labeled_train_original_df.copy(deep = True)\n",
    "semi_supervised_unlabeled_train_df = semi_supervised_unlabeled_train_original_df.copy(deep = True)\n",
    "\n",
    "# Sentinel values chosen only so that the loop condition holds on the first pass\n",
    "unlabeled_confident_indices = np.zeros([1])\n",
    "accuracy = 0.000001\n",
    "old_accuracy = 0.0\n",
    "\n",
    "# Keep going while unlabeled rows remain, the previous round pseudo-labeled at least one row,\n",
    "# and evaluation accuracy strictly improved over the previous round.\n",
    "# NOTE(review): the stopping rule reads accuracy measured on the test file, so the final test\n",
    "# accuracy is optimistically biased; a separate validation split would avoid this - confirm intent.\n",
    "loop_counter = 0\n",
    "while len(semi_supervised_unlabeled_train_df) > 0 and unlabeled_confident_indices.shape[0] > 0 and accuracy > old_accuracy:\n",
    "  print(\"\\nloop_counter = {}, number_of_labeled_examples = {}, number_of_unlabeled_examples = {}\\n\".format(loop_counter, len(semi_supervised_labeled_train_df), len(semi_supervised_unlabeled_train_df)))\n",
    "  # Train on currently labeled data; x holds only the feature columns so the label never travels in the feature dict\n",
    "  train_input_fn = tf.estimator.inputs.pandas_input_fn(\n",
    "    x = semi_supervised_labeled_train_df[feature_column_names], \n",
    "    y = semi_supervised_labeled_train_df[LABEL_COLUMN_NAME], \n",
    "    batch_size = 32, \n",
    "    num_epochs = None, \n",
    "    shuffle = True)\n",
    "\n",
    "  semi_supervised_estimator.train(\n",
    "    input_fn = train_input_fn, \n",
    "    steps = 2000)\n",
    "\n",
    "  # Check evaluation metrics on held out evaluation set now that training is over\n",
    "  eval_metrics = semi_supervised_estimator.evaluate(\n",
    "    input_fn = read_dataset(\n",
    "      filename = test_path, \n",
    "      mode = tf.estimator.ModeKeys.EVAL, \n",
    "      batch_size = 512), \n",
    "    steps = None)\n",
    "\n",
    "  old_accuracy = accuracy\n",
    "  accuracy = eval_metrics[\"accuracy\"]\n",
    "\n",
    "  # Now predict from the unlabeled set (unshuffled, single pass, so predictions line up with row positions)\n",
    "  predict_input_fn = tf.estimator.inputs.pandas_input_fn(\n",
    "    x = semi_supervised_unlabeled_train_df, \n",
    "    y = None, \n",
    "    batch_size = 512, \n",
    "    num_epochs = 1, \n",
    "    shuffle = False)\n",
    "\n",
    "  predictions = list(semi_supervised_estimator.predict(input_fn = predict_input_fn))\n",
    "\n",
    "  # Get the probabilities and class ids from the prediction list generated from the estimator\n",
    "  probabilities = np.array(object = [prediction[\"probabilities\"] \n",
    "                                     for prediction in predictions])\n",
    "  class_ids = np.array(object = [prediction[\"class_ids\"] \n",
    "                                 for prediction in predictions])\n",
    "\n",
    "  # Check if our predictions are above the confidence threshold\n",
    "  confidence_condition = np.amax(a = probabilities, axis = 1) > confidence_threshold\n",
    "\n",
    "  # Row positions of the confident and unconfident unlabeled predictions, used to slice the unlabeled dataframe\n",
    "  unlabeled_confident_indices = np.where(confidence_condition)[0]\n",
    "  unlabeled_unconfident_indices = np.where(~confidence_condition)[0]\n",
    "\n",
    "  # Get the class ids of the confident unlabeled predictions\n",
    "  unlabeled_confident_class_ids = np.squeeze(a = class_ids[confidence_condition], axis = 1)\n",
    "\n",
    "  # Combine the confidently predicted rows with their predicted class id. The indices are positions,\n",
    "  # hence iloc; assign returns a new frame instead of writing into a selection of the unlabeled frame.\n",
    "  new_labeled_df = semi_supervised_unlabeled_train_df.iloc[unlabeled_confident_indices].assign(\n",
    "    **{LABEL_COLUMN_NAME: unlabeled_confident_class_ids})\n",
    "\n",
    "  semi_supervised_labeled_train_df = pd.concat(\n",
    "    objs = [semi_supervised_labeled_train_df, new_labeled_df], \n",
    "    axis = 0).reset_index(drop = True)\n",
    "\n",
    "  # Remove the confident predictions leaving only the unconfident predictions to go another round through the loop\n",
    "  semi_supervised_unlabeled_train_df = semi_supervised_unlabeled_train_df.iloc[unlabeled_unconfident_indices].reset_index(drop = True)\n",
    "\n",
    "  loop_counter += 1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
